R Markdown

This is an initial test report that analyzes the causality between variables of development projects (GitHub) and quality characteristics of the software (SonarCloud). The data has been obtained through the public APIs of both platforms, and the JSON data has been filtered and pre-processed using an intermediate MongoDB document database.

The result of the preprocessing has been stored in a CSV file. The first step is to import the data from that file.

library(readr)
# Read the pre-processed metrics from the CSV file into a tibble.
sonar_git <- read_csv(file = "../data/sonar-git.csv")
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   project = col_character(),
##   version = col_character(),
##   from = col_datetime(format = ""),
##   to = col_datetime(format = ""),
##   file_complexity_distribution = col_character(),
##   function_complexity_distribution = col_character(),
##   alert_status = col_character()
## )
## See spec(...) for full column specifications.

Then, we filter the matrix (with 107 variables) to keep only the variables of interest (identified after some preliminary analyses). With the filtered data, we show the descriptive statistics.

library(dplyr)
## Warning: package 'dplyr' was built under R version 3.6.3
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Keep the two identifier columns, the ten measures analysed below, and
# `lines`, which is used afterwards as the normalisation denominator.
dataset0 <- sonar_git %>%
  select(
    project, version,
    commits, committers, changes_by_commit, committers_weight,
    bugs, code_smells, complexity, violations, duplicated_lines, open_issues,
    lines
  )

# Filter missing values.
# NOTE(review): rows are dropped by hard-coded position, so this is tied to
# the current row order of the CSV -- confirm whenever the data file changes.
# (172 and 173 used to be listed separately but are already inside 133:190.)
rows_to_drop <- c(133:190, 195:205, 212:228, 233:245, 249:256)
dataset0 <- dataset0[-rows_to_drop, ]

# Express the count-based measures per 1000 lines of code. Every measure is
# divided exactly once; previously `commits` was divided twice and therefore
# ended up scaled by (lines / 1000)^2 instead of lines / 1000.
per_kloc_vars <- c("commits", "committers", "bugs", "code_smells",
                   "violations", "duplicated_lines", "open_issues")
kloc <- dataset0$lines / 1000
dataset1 <- dataset0
dataset1[per_kloc_vars] <- lapply(dataset0[per_kloc_vars], function(x) x / kloc)

# `lines` (the 13th column) was only needed as the denominator.
dataset1 <- dataset1[, 1:12]

# NOTE(review): the rendered output and the hard-coded axis limits used in
# later plots were produced with the old `commits` scaling; re-knit and
# re-check them.
summary(dataset1)
##    project            version             commits           committers     
##  Length:156         Length:156         Min.   :0.000000   Min.   :0.00000  
##  Class :character   Class :character   1st Qu.:0.001144   1st Qu.:0.02349  
##  Mode  :character   Mode  :character   Median :0.006117   Median :0.05650  
##                                        Mean   :0.023112   Mean   :0.08222  
##                                        3rd Qu.:0.022821   3rd Qu.:0.11896  
##                                        Max.   :0.358338   Max.   :0.32386  
##  changes_by_commit  committers_weight      bugs          code_smells     
##  Min.   :    0.00   Min.   :0.00000   Min.   :0.00000   Min.   :  0.000  
##  1st Qu.:   48.84   1st Qu.:0.02117   1st Qu.:0.00000   1st Qu.:  2.174  
##  Median :  178.56   Median :0.06510   Median :0.07789   Median :  5.738  
##  Mean   :  946.46   Mean   :0.23019   Mean   :0.31509   Mean   : 12.283  
##  3rd Qu.:  428.46   3rd Qu.:0.23180   3rd Qu.:0.63989   3rd Qu.: 17.862  
##  Max.   :34902.00   Max.   :1.00000   Max.   :1.38801   Max.   :106.491  
##    complexity       violations       duplicated_lines   open_issues      
##  Min.   :  1026   Min.   :  0.6145   Min.   :  2.548   Min.   :  0.6145  
##  1st Qu.:  2580   1st Qu.:  2.5552   1st Qu.:  8.982   1st Qu.:  2.5513  
##  Median :  4026   Median :  6.4159   Median : 14.523   Median :  5.3882  
##  Mean   : 12043   Mean   : 14.1213   Mean   : 26.695   Mean   : 13.8915  
##  3rd Qu.: 11648   3rd Qu.: 24.5530   3rd Qu.: 28.229   3rd Qu.: 24.5530  
##  Max.   :143551   Max.   :106.4907   Max.   :157.694   Max.   :106.4907

Including Plots

First, we analyse the relationship between commits and committers.

library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.6.3
library(plotly)
## Warning: package 'plotly' was built under R version 3.6.3
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
# Horizontal box plot of commits, one box per project.
p <- ggplot(data = dataset1, mapping = aes(x = commits, y = project)) +
  geom_boxplot(fill = "gray")
p

# Same layout for committers.
p <- ggplot(data = dataset1, mapping = aes(x = committers, y = project)) +
  geom_boxplot(fill = "gray")
p

# Commits against committers, points coloured by project, overlaid with a
# filled 2D density estimate. The fill is mapped to the computed contour
# level via after_stat(); the older `..level..` notation is deprecated.
sp <- ggplot(dataset1, aes(x = commits, y = committers)) +
  geom_point(aes(colour = project)) +
  stat_density_2d(aes(fill = after_stat(level)), geom = "polygon", alpha = 0.2) +
  scale_fill_gradient(low = "green", high = "red")

sp + theme_classic()

# Same plot restricted to a fixed window; coord_cartesian() only changes the
# visible region, it does not drop points from the density estimate.
zoom_sp <- sp + coord_cartesian(xlim = c(0, 2), ylim = c(0, 0.2))
zoom_sp + theme_classic()

# 2D kernel density of (committers, commits) on a 50 x 50 grid, rendered as
# an interactive plotly surface.
kd <- MASS::kde2d(dataset1$committers, dataset1$commits, n = 50)
fig <- add_surface(plot_ly(x = kd$x, y = kd$y, z = kd$z))

fig

As preliminary analysis, we compute correlation values and draw a matrix of scatter plots:

# Drop the first two columns (project, version) so only the measures remain,
# then compute their correlation matrix and draw the scatter-plot matrix.
dataset_only_data <- select(dataset1, -(1:2))
M <- cor(x = dataset_only_data)
plot(x = dataset_only_data)

library(corrgram)
## Warning: package 'corrgram' was built under R version 3.6.3
## Registered S3 method overwritten by 'seriation':
##   method         from 
##   reorder.hclust gclus
# Correlogram in the original variable order: shaded cells below the
# diagonal, pie glyphs above it, variable names on the diagonal.
corrgram(
  dataset_only_data,
  order = FALSE,
  lower.panel = panel.shade,
  upper.panel = panel.pie,
  text.panel = panel.txt,
  main = "correlation between variables"
)

library(corrplot)
## Warning: package 'corrplot' was built under R version 3.6.3
## corrplot 0.84 loaded
# Three renderings of the same correlation matrix, one per glyph type.
for (glyph in c("circle", "ellipse", "number")) {
  corrplot(M, method = glyph)
}

# Coloured upper triangle with the coefficients printed in black. Cells whose
# correlation test p-value exceeds 0.05 are left blank and the diagonal is
# hidden.
col <- colorRampPalette(c("#77AA44", "#AADD77", "#FFFFFF", "#EE9988", "#BB4444"))
res1 <- cor.mtest(dataset_only_data, conf.level = .95)
corrplot(
  M,
  method = "color", col = col(200),
  type = "upper", order = "original", diag = FALSE,
  addCoef.col = "black", number.cex = .8,
  tl.col = "black", tl.srt = 90,
  p.mat = res1$p, sig.level = 0.05, insig = "blank"
)

We focus on some variables where we observe certain correlation. First, we observe the behaviour of commits against complexity

library(ggplot2)
library(ggpubr)
## Warning: package 'ggpubr' was built under R version 3.6.3
## Loading required package: magrittr
# Rows of the sonarqube project only (all 12 columns kept).
sonarqube <- dataset1[dataset1$project == "sonarqube", 1:12]

# Commits against bugs with a dashed linear fit and the Pearson coefficient.
sp <- ggplot(data = sonarqube, mapping = aes(x = commits, y = bugs)) +
  geom_point(mapping = aes(colour = project), shape = 16) +
  geom_smooth(method = lm, linetype = "dashed", color = "darkred", fill = "grey") +
  theme(legend.position = "bottom") +
  stat_cor(method = "pearson")
sp
## `geom_smooth()` using formula 'y ~ x'

# Rows of the jacoco project only (all 12 columns kept).
jacoco <- dataset1[dataset1$project == "jacoco", 1:12]

# Commits against code smells with a dashed linear fit and the Pearson
# coefficient.
sp <- ggplot(data = jacoco, mapping = aes(x = commits, y = code_smells)) +
  geom_point(mapping = aes(colour = project), shape = 16) +
  geom_smooth(method = lm, linetype = "dashed", color = "darkred", fill = "grey") +
  stat_cor(method = "pearson") +
  theme(legend.position = "bottom")
sp
## `geom_smooth()` using formula 'y ~ x'

# Rows of the monica project only (all 12 columns kept).
monica <- dataset1[dataset1$project == "monica", 1:12]

# Changes per commit against code smells with a dashed linear fit and the
# Pearson coefficient.
sp <- ggplot(data = monica, mapping = aes(x = changes_by_commit, y = code_smells)) +
  geom_point(mapping = aes(colour = project), shape = 16) +
  geom_smooth(method = lm, linetype = "dashed", color = "darkred", fill = "grey") +
  stat_cor(method = "pearson") +
  theme(legend.position = "bottom")
sp
## `geom_smooth()` using formula 'y ~ x'

# Rows of the Ant-Media-Server project only (all 12 columns kept).
ant <- dataset1[dataset1$project == "Ant-Media-Server", 1:12]

# Committers against complexity with a dashed linear fit and the Pearson
# coefficient.
sp <- ggplot(data = ant, mapping = aes(x = committers, y = complexity)) +
  geom_point(mapping = aes(colour = project), shape = 16) +
  geom_smooth(method = lm, linetype = "dashed", color = "darkred", fill = "grey") +
  stat_cor(method = "pearson") +
  theme(legend.position = "bottom")
sp
## `geom_smooth()` using formula 'y ~ x'

 # Commits against complexity for every row, points coloured by project.
 ggplot(data = dataset1, mapping = aes(x = commits, y = complexity)) +
   geom_point(mapping = aes(colour = project))

 # Same scatter with one linear fit per project and no confidence band.
 ggplot(data = dataset1, mapping = aes(x = commits, y = complexity, colour = project)) +
   geom_point(shape = 16) +
   geom_smooth(method = lm, se = FALSE)
## `geom_smooth()` using formula 'y ~ x'

# Commits against complexity with a single dashed linear fit over all
# projects; only the points are coloured by project.
sp <- ggplot(data = dataset1, mapping = aes(x = commits, y = complexity)) +
  geom_point(mapping = aes(colour = project), shape = 16) +
  geom_smooth(
    method = lm,
    linetype = "dashed",
    color = "darkred",
    fill = "grey"
  )
sp
## `geom_smooth()` using formula 'y ~ x'

# Restrict the visible window of the previous plot without refitting it.
zoom_sp <- sp +
  coord_cartesian(
    xlim = c(0, 0.2),
    ylim = c(-20000, 20000)
  )
zoom_sp
## `geom_smooth()` using formula 'y ~ x'

# Commits against complexity, points coloured by project.
sp <- ggplot(dataset1, aes(x = commits, y = complexity)) +
  geom_point(aes(colour = project))

# Contour lines of the 2D density estimate.
sp + geom_density_2d()

# Filled contours; the fill is mapped to the computed contour level via
# after_stat(), since the older `..level..` notation is deprecated.
sp +
  stat_density_2d(aes(fill = after_stat(level)), geom = "polygon") +
  scale_fill_gradient(low = "green", high = "red")

# Per-project density curves of committers, lightly filled so that
# overlapping curves remain visible.
committers_density <- ggplot(data = dataset1, mapping = aes(x = committers, fill = project)) +
  geom_density(
    mapping = aes(group = project, colour = project, fill = project),
    alpha = .1
  ) +
  theme(legend.position = "right")
committers_density

# Restrict the visible window of the density plot.
zoom_sp <- committers_density +
  coord_cartesian(xlim = c(0, 0.10), ylim = c(0, 75))
zoom_sp

# Per-project density curves of commits, lightly filled so that overlapping
# curves remain visible.
commits_density <- ggplot(data = dataset1, mapping = aes(x = commits, fill = project)) +
  geom_density(
    mapping = aes(group = project, colour = project, fill = project),
    alpha = .1
  ) +
  theme(legend.position = "right")
commits_density

# Restrict the visible window of the density plot.
zoom_sp <- commits_density +
  coord_cartesian(xlim = c(0, 0.3), ylim = c(0, 10))
zoom_sp

We carry out a hierarchical clustering with all the variables and take 4 clusters

# Agglomerative clustering with centroid linkage on the pairwise distances
# between the (unscaled) rows of dataset_only_data.
# NOTE(review): the hclust() documentation says the "centroid" method is meant
# to be used with squared Euclidean distances; dist() is passed unsquared
# here -- confirm this is intended.
ddata1 <- dist(x = dataset_only_data)
gdata1 <- hclust(d = ddata1, method = "centroid")

# Dendrogram with the four-cluster cut outlined.
plot(gdata1, sub = "example", xlab = "cases", ylab = "high")
rect.hclust(
  tree = gdata1,
  k = 4,
  border = c("red", "blue", "green", "orange")
)

# Cluster membership of every row for the four-cluster cut.
clusters <- cutree(tree = gdata1, k = 4)
clusters
##   [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
##  [38] 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
##  [75] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 3 1 1 1 1 3 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [112] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 4 4
## [149] 4 1 1 1 1 1 1 1
# Copy of the measures with the hierarchical cluster assignment appended as a
# factor column.
dataset_clusters <- dataset_only_data
dataset_clusters[["cluster"]] <- factor(clusters)

## K-means with scaled values

We carry out a K-means clustering with all the variables scaled, considering 4 clusters.

library(cluster)
## Warning: package 'cluster' was built under R version 3.6.3
# Method for determining the best number of clusters in K-means: look for a
# bend ("elbow") in the within-groups sum of squares as k grows.
# The run is seeded and uses several random starts per k, so the curve is
# reproducible and not distorted by one unlucky initialisation.
# NOTE(review): the curve is computed on the unscaled measures, whereas the
# k-means fit that follows is run on scale()d data -- confirm this is intended.
mydata <- dataset_only_data
max_k <- 10
set.seed(1)

# k = 1: total sum of squares around the column means.
wss <- numeric(max_k)
wss[1] <- (nrow(mydata) - 1) * sum(vapply(mydata, var, numeric(1)))
for (i in 2:max_k) {
  wss[i] <- kmeans(mydata, centers = i, nstart = 100)$tot.withinss
}

plot(seq_len(max_k), wss, type = "b", xlab = "Number of Clusters",
     ylab = "Within groups sum of squares")

# Standardise every measure, then fit k-means with four clusters.
# The seed makes the partition (and its arbitrary cluster numbering)
# reproducible between knits; nstart keeps the best of several random starts,
# as the normalised k-means fit later in this report does.
# NOTE(review): cluster numbers quoted in printed output or prose may change
# after re-knitting with the seed -- re-check them.
zdata1 <- scale(dataset_only_data)
set.seed(1)
kcdata1 <- kmeans(x = zdata1, centers = 4, nstart = 100)
kcdata1$cluster
##   [1] 4 4 4 2 4 4 4 4 2 2 4 2 2 4 2 2 4 2 2 2 2 2 4 2 2 4 4 4 2 4 4 2 2 2 2 2 2
##  [38] 4 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 4 2 4 4 4 4 4 4 2 2 2 2 4 4
##  [75] 4 4 4 4 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 2 2 2 2 2 2 2 2 2 2 2
## [112] 2 3 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 4 2 2 2 4 2 2 2 2 2 2 2 4 2 4 2 1 1
## [149] 1 3 3 3 3 3 3 3
# Bivariate cluster plot of the scaled data, coloured and shaded by cluster.
clusplot(
  zdata1, kcdata1$cluster,
  color = TRUE, shade = TRUE,
  labels = 2, lines = 0
)

# Store the scaled k-means assignment next to the measures (cluster2) ...
dataset_clusters[["cluster2"]] <- factor(kcdata1$cluster)

# ... and next to the full data set including project and version (cluster).
dataset1_cluster <- dataset1
dataset1_cluster[["cluster"]] <- factor(kcdata1$cluster)

We performed the characterization of clusters for the k-means algorithm

# Use a single-panel layout for the base-graphics plots that follow.
par(mfrow = c(1, 1))
library(lattice)
## 
## Attaching package: 'lattice'
## The following object is masked from 'package:corrgram':
## 
##     panel.fill
# Scatter-plot matrix of the first nine columns, points grouped by the scaled
# k-means assignment (cluster2).
# NOTE(review): 1:9 leaves out the tenth measure column, while the per-cluster
# correlations elsewhere in this report use 1:10 -- confirm this is intended.
splom(
  ~ dataset_clusters[1:9],
  data = dataset_clusters,
  groups = cluster2,
  pch = 16
)

library(vioplot)
## Warning: package 'vioplot' was built under R version 3.6.3
## Loading required package: sm
## Warning: package 'sm' was built under R version 3.6.3
## Package 'sm', version 2.2-5.6: type help(sm) for summary information
## Loading required package: zoo
## Warning: package 'zoo' was built under R version 3.6.3
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
# Code smells split by scaled k-means cluster (cluster2), one violin each.
x1 <- with(dataset_clusters, code_smells[cluster2 == 1])
x2 <- with(dataset_clusters, code_smells[cluster2 == 2])
x3 <- with(dataset_clusters, code_smells[cluster2 == 3])
x4 <- with(dataset_clusters, code_smells[cluster2 == 4])

# vioplot(x1, x2, x3, names=c("1", "2", "3"), col="grey")
vioplot(x1, x2, x3, x4, names = c("1", "2", "3", "4"), col = "grey")

title("Code smells per cluster")

# Commits split the same way.
x1 <- with(dataset_clusters, commits[cluster2 == 1])
x2 <- with(dataset_clusters, commits[cluster2 == 2])
x3 <- with(dataset_clusters, commits[cluster2 == 3])
x4 <- with(dataset_clusters, commits[cluster2 == 4])

# vioplot(x1, x2, x3, names=c("1", "2", "3"), col="grey")
vioplot(x1, x2, x3, x4, names = c("1", "2", "3", "4"), col = "grey")

title("Commits per cluster")

We compute correlation and scatter plots for clusters

library(corrplot)

# Split the ten measure columns by scaled k-means cluster (cluster2).
rows_by_cluster <- lapply(c("1", "2", "3", "4"), function(id) {
  dataset_clusters[dataset_clusters$cluster2 == id, 1:10]
})
c1 <- rows_by_cluster[[1]]
c2 <- rows_by_cluster[[2]]
c3 <- rows_by_cluster[[3]]
c4 <- rows_by_cluster[[4]]

# One numeric correlation plot per cluster, in cluster order.
for (cluster_rows in rows_by_cluster) {
  corrplot(cor(cluster_rows), method = "number")
}

col <- colorRampPalette(c("#77AA44", "#AADD77", "#FFFFFF", "#EE9988", "#BB4444"))

# One coloured upper-triangle plot per cluster; cells whose correlation test
# p-value exceeds 0.05 are left blank.
for (cluster_rows in rows_by_cluster) {
  res1 <- cor.mtest(cluster_rows, conf.level = .95)
  corrplot(
    cor(cluster_rows),
    method = "color", col = col(200),
    type = "upper", order = "original", number.cex = .8,
    addCoef.col = "black", tl.col = "black", tl.srt = 90,
    p.mat = res1$p, sig.level = 0.05, insig = "blank", diag = FALSE
  )
}

# Commits against code smells; colour and shape encode the cluster, with a
# dashed linear fit per cluster, shown in a fixed window.
sp <- ggplot(
  data = dataset1_cluster,
  mapping = aes(x = commits, y = code_smells, colour = cluster, shape = cluster)
) +
  geom_point() +
  geom_smooth(method = lm, linetype = "dashed", color = "darkred", fill = "grey") +
  coord_cartesian(xlim = c(0, 0.25), ylim = c(0, 45))
sp
## `geom_smooth()` using formula 'y ~ x'

# Committers against code smells; colour and shape encode the cluster, with a
# dashed linear fit per cluster, shown in a fixed window.
sp <- ggplot(
  data = dataset1_cluster,
  mapping = aes(x = committers, y = code_smells, colour = cluster, shape = cluster)
) +
  geom_point() +
  geom_smooth(method = lm, linetype = "dashed", color = "darkred", fill = "grey") +
  coord_cartesian(xlim = c(0, 0.3), ylim = c(0, 50))
sp
## `geom_smooth()` using formula 'y ~ x'

# Commits against complexity; colour and shape encode the cluster, with a
# dashed linear fit per cluster.
sp <- ggplot(
  data = dataset1_cluster,
  mapping = aes(x = commits, y = complexity, colour = cluster, shape = cluster)
) +
  geom_point() +
  geom_smooth(method = lm, linetype = "dashed", color = "darkred", fill = "grey")
sp
## `geom_smooth()` using formula 'y ~ x'

# Committers against complexity; colour and shape encode the cluster, with a
# dashed linear fit per cluster.
sp <- ggplot(
  data = dataset1_cluster,
  mapping = aes(x = committers, y = complexity, colour = cluster, shape = cluster)
) +
  geom_point() +
  geom_smooth(method = lm, linetype = "dashed", color = "darkred", fill = "grey")
sp
## `geom_smooth()` using formula 'y ~ x'

Some 3d plots with correlations of several measures by cluster

## Plotting for sonarqube project, cluster 2 and 3 differences

Plotting for sonarqube project, cluster 2 and 3 differences

library(ggplot2)
library(ggpubr)
# Make theme_minimal() the default theme for the ggplot2 figures that follow.
theme_set(new = theme_minimal())

# Preview of the data set with the cluster column attached.
dataset1_cluster
## # A tibble: 156 x 13
##    project version commits committers changes_by_comm~ committers_weig~  bugs
##    <chr>   <chr>     <dbl>      <dbl>            <dbl>            <dbl> <dbl>
##  1 monica  master  0.0227      0.112             178.            0.0151 0.576
##  2 monica  2.16.0  0.0306      0.254             228.            0.0091 0.553
##  3 monica  2.15.2  0.0222      0.198             180.            0.0153 0.638
##  4 monica  2.15.1  0.00119     0.0154             10.8           1      1.39 
##  5 monica  2.15.0  0.0633      0.324             136.            0.0059 1.39 
##  6 monica  2.13.0  0.0116      0.164             178.            0.0287 0.738
##  7 monica  2.12.1  0.0128      0.165             225.            0.0351 0.760
##  8 monica  2.12.0  0.0293      0.199            2097.            0.0132 0.764
##  9 monica  2.11.2  0.00116     0.0361             93.5           0.221  0.553
## 10 monica  2.11.1  0.00233     0.0362           1150.            0.147  0.555
## # ... with 146 more rows, and 6 more variables: code_smells <dbl>,
## #   complexity <dbl>, violations <dbl>, duplicated_lines <dbl>,
## #   open_issues <dbl>, cluster <fct>
# Rows of the Ant-Media-Server project, all 13 columns (measures + cluster).
# NOTE(review): the headings around this section mention the sonarqube
# project, but the rows selected here belong to 'Ant-Media-Server' -- confirm
# which project is intended.
ant <- dataset1_cluster[dataset1_cluster$project == "Ant-Media-Server", 1:13]

# One line plot per measure across versions; the points carry the cluster as
# colour and shape, and the x tick labels are hidden.
# `data =` is spelled out in full; the previous `dat =` only worked through
# partial argument matching.
# NOTE(review): `version` is a character column, so the x axis is ordered
# alphabetically, which is not necessarily release order -- verify.
p_code_smells <- ggplot(data = ant, aes(x = version, y = code_smells)) +
  geom_line(aes(group = 1)) +
  geom_point(aes(colour = cluster, shape = cluster, group = cluster), size = 3) +
  theme(axis.text.x = element_blank())
p_code_smells

p_bugs <- ggplot(data = ant, aes(x = version, y = bugs)) +
  geom_line(aes(group = 1)) +
  geom_point(aes(colour = cluster, shape = cluster, group = cluster), size = 3) +
  theme(axis.text.x = element_blank())
p_bugs

p_cloning <- ggplot(data = ant, aes(x = version, y = duplicated_lines)) +
  geom_line(aes(group = 1)) +
  geom_point(aes(colour = cluster, shape = cluster, group = cluster), size = 3) +
  theme(axis.text.x = element_blank())
p_cloning

p_violations <- ggplot(data = ant, aes(x = version, y = violations)) +
  geom_line(aes(group = 1)) +
  geom_point(aes(colour = cluster, shape = cluster, group = cluster), size = 3) +
  theme(axis.text.x = element_blank())
p_violations

p_committers <- ggplot(data = ant, aes(x = version, y = committers)) +
  geom_line(aes(group = 1)) +
  geom_point(aes(colour = cluster, shape = cluster, group = cluster), size = 3) +
  theme(axis.text.x = element_blank())

p_committers

# Stack three of the panels vertically, labelled a-c.
figure <- ggarrange(p_code_smells, p_cloning, p_committers,
                    labels = c("a", "b", "c"), ncol = 1, nrow = 3) +
  theme(axis.text.x = element_blank())
figure

## K-means for normalized values

We perform the k-means algorithm with normalized values and Euclidean distance.

library(vegan)
## Warning: package 'vegan' was built under R version 3.6.3
## Loading required package: permute
## Warning: package 'permute' was built under R version 3.6.3
## Registered S3 methods overwritten by 'vegan':
##   method         from      
##   reorder.hclust seriation 
##   rev.hclust     dendextend
## This is vegan 2.5-6
library(permute)

# Data normalization, followed by Euclidean distances between the normalized
# rows.
spe.norm <- decostand(dataset_only_data, method = "normalize")
spe.ch <- vegdist(spe.norm, method = "euclidean")

# Hierarchical clustering of those distances and its dendrogram.
# NOTE(review): the hclust() documentation says "ward.D" only implements
# Ward's criterion when given squared distances ("ward.D2" squares them
# itself) -- confirm "ward.D" is the intended choice here.
spe.ch.ward <- hclust(spe.ch, method = "ward.D")
plot(spe.ch.ward, sub = "Ward method")

# Cascade of k-means partitions from 2 to 10 groups. The partitions are
# scored with the "ssi" criterion requested below (not the Calinski one).
spe.KM.cascade <- cascadeKM(
  spe.norm,
  inf.gr = 2,
  sup.gr = 10,
  iter = 1000,
  criterion = "ssi"
)
spe.KM.cascade$results
##       2 groups  3 groups   4 groups   5 groups   6 groups   7 groups   8 groups
## SSE 2.30746283 0.6965684 0.45755324 0.32813874 0.19934571 0.14242250 0.11631444
## ssi 0.02105365 0.0220436 0.02226886 0.02693511 0.02217787 0.02252665 0.02247974
##       9 groups  10 groups
## SSE 0.09939645 0.09112660
## ssi 0.02574863 0.02797884
# Plot the cascade result with sortg = TRUE.
plot(x = spe.KM.cascade, sortg = TRUE)

# Compute k-means once, seeded, so that the silhouette plot, the
# cross-tabulation and everything that uses `spe.kmeans` afterwards describe
# the same partition. Previously the silhouette was drawn from an unseeded
# fit that was then discarded and refitted after set.seed(1).
set.seed(1)
spe.kmeans <- kmeans(spe.norm, centers = 4, nstart = 100)

# Silhouette plot. The assignment vector is read through its full name
# `cluster`; `$cl` only worked through partial matching.
dissE <- daisy(spe.norm)
sk <- silhouette(spe.kmeans$cluster, dissE)
plot(sk)

# Cross-tabulate the k-means clusters against the four-group cut of the Ward
# dendrogram.
spebc.ward.g <- cutree(spe.ch.ward, k = 4)
table(spe.kmeans$cluster, spebc.ward.g)
##    spebc.ward.g
##      1  2  3  4
##   1  0  0 11  0
##   2 33 76  0  0
##   3 24  0  7  0
##   4  0  0  0  5
# Bivariate cluster plot of the normalized data, coloured and shaded by
# cluster.
clusplot(
  spe.norm, spe.kmeans$cluster,
  color = TRUE, shade = TRUE,
  labels = 2, lines = 0
)

# Store the normalized k-means assignment next to the measures.
dataset_clusters[["cluster3"]] <- factor(spe.kmeans$cluster)

We performed the characterization of clusters for the k-means algorithm

# Use a single-panel layout for the base-graphics plots that follow.
par(mfrow = c(1, 1))
library(lattice)
# Scatter-plot matrix of the first nine columns, points grouped by the
# normalized k-means assignment (cluster3).
# NOTE(review): 1:9 leaves out the tenth measure column, while the per-cluster
# correlations elsewhere in this report use 1:10 -- confirm this is intended.
splom(
  ~ dataset_clusters[1:9],
  data = dataset_clusters,
  groups = cluster3,
  pch = 16
)

library(vioplot)

# Code smells split by normalized k-means cluster (cluster3), one violin each.
x1 <- with(dataset_clusters, code_smells[cluster3 == 1])
x2 <- with(dataset_clusters, code_smells[cluster3 == 2])
x3 <- with(dataset_clusters, code_smells[cluster3 == 3])
x4 <- with(dataset_clusters, code_smells[cluster3 == 4])

# vioplot(x1, x2, x3, names=c("1", "2", "3"), col="grey")
vioplot(x1, x2, x3, x4, names = c("1", "2", "3", "4"), col = "grey")

title("Code smells per cluster")

# Commits split the same way.
x1 <- with(dataset_clusters, commits[cluster3 == 1])
x2 <- with(dataset_clusters, commits[cluster3 == 2])
x3 <- with(dataset_clusters, commits[cluster3 == 3])
x4 <- with(dataset_clusters, commits[cluster3 == 4])

# vioplot(x1, x2, x3, names=c("1", "2", "3"), col="grey")
vioplot(x1, x2, x3, x4, names = c("1", "2", "3", "4"), col = "grey")

title("Commits per cluster")

We compute correlation and scatter plots for clusters

# Split the ten measure columns by normalized k-means cluster (cluster3).
rows_by_cluster <- lapply(c("1", "2", "3", "4"), function(id) {
  dataset_clusters[dataset_clusters$cluster3 == id, 1:10]
})
c1 <- rows_by_cluster[[1]]
c2 <- rows_by_cluster[[2]]
c3 <- rows_by_cluster[[3]]
c4 <- rows_by_cluster[[4]]

# One numeric correlation plot per cluster, in cluster order.
for (cluster_rows in rows_by_cluster) {
  corrplot(cor(cluster_rows), method = "number")
}

col <- colorRampPalette(c("#77AA44", "#AADD77", "#FFFFFF", "#EE9988", "#BB4444"))

# One coloured upper-triangle plot per cluster; cells whose correlation test
# p-value exceeds 0.05 are left blank.
for (cluster_rows in rows_by_cluster) {
  res1 <- cor.mtest(cluster_rows, conf.level = .95)
  corrplot(
    cor(cluster_rows),
    method = "color", col = col(200),
    type = "upper", order = "original", number.cex = .8,
    addCoef.col = "black", tl.col = "black", tl.srt = 90,
    p.mat = res1$p, sig.level = 0.05, insig = "blank", diag = FALSE
  )
}

# Commits against complexity; colour and shape encode the normalized k-means
# cluster (cluster3), with a dashed linear fit per cluster.
sp <- ggplot(
  data = dataset_clusters,
  mapping = aes(x = commits, y = complexity, colour = cluster3, shape = cluster3)
) +
  geom_point() +
  geom_smooth(method = lm, linetype = "dashed", color = "darkred", fill = "grey")
sp
## `geom_smooth()` using formula 'y ~ x'

Some 3d plots with correlations of several measures by cluster